/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <algorithm>

#include "arm_gemm.hpp"


#include "../../asmlib.hpp"
#include "../../utils.hpp"

namespace arm_gemm {

void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
    const long loops_count = iceildiv(N, (int)4) - 1;
    const long ldab = lda * sizeof(float);
    const long ldcb = ldc * sizeof(float);
    float nullbias[4];
    if (!bias) {
        memset(nullbias, 0, (4 * sizeof(float)));
    }
    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
    const float * const minptr = &minval;
    const float * const maxptr = &maxval;

    switch(act.type)
    {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0.0f;
            break;
    }

    for (int y0=0; y0<M; y0+=6) {
        long loops = loops_count;
        long oob_rows = std::max(6 - (M-y0), 0);
        const float *b_ptr0 = B;
        const float *biasptr = bias ? bias : nullbias;
        const uint64_t biasinc = bias ? 4*sizeof(float) : 0;
        const float *a_ptr0 = A + (y0 * lda);

        float *c_ptr0 = C + (y0 * ldc);

        switch(K) {
            case 9:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q3, [a_ptr1], #0x10\n"
                    "ldr q6, [a_ptr2], #0x10\n"
                    "ldr q9, [a_ptr3], #0x10\n"
                    "ldr q12, [a_ptr4], #0x10\n"
                    "ldr q15, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q7, [a_ptr2], #0x10\n"
                    "ldr q10, [a_ptr3], #0x10\n"
                    "ldr s2, [%[a_ptr0]]\n"
                    "ldr q13, [a_ptr4], #0x10\n"
                    "ldr s5, [a_ptr1]\n"
                    "ldr q16, [a_ptr5], #0x10\n"
                    "ldr s8, [a_ptr2]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr s11, [a_ptr3]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "ldr s14, [a_ptr4]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "ldr s17, [a_ptr5]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 10:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q3, [a_ptr1], #0x10\n"
                    "ldr q6, [a_ptr2], #0x10\n"
                    "ldr q9, [a_ptr3], #0x10\n"
                    "ldr q12, [a_ptr4], #0x10\n"
                    "ldr q15, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q7, [a_ptr2], #0x10\n"
                    "ldr q10, [a_ptr3], #0x10\n"
                    "ldr d2, [%[a_ptr0]]\n"
                    "ldr q13, [a_ptr4], #0x10\n"
                    "ldr d5, [a_ptr1]\n"
                    "ldr q16, [a_ptr5], #0x10\n"
                    "ldr d8, [a_ptr2]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr d11, [a_ptr3]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "ldr d14, [a_ptr4]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "ldr d17, [a_ptr5]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 11:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q3, [a_ptr1], #0x10\n"
                    "ldr q6, [a_ptr2], #0x10\n"
                    "ldr q9, [a_ptr3], #0x10\n"
                    "ldr q12, [a_ptr4], #0x10\n"
                    "ldr q15, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q7, [a_ptr2], #0x10\n"
                    "ldr q10, [a_ptr3], #0x10\n"
                    "ldr d2, [%[a_ptr0]], #0x8\n"
                    "ldr q13, [a_ptr4], #0x10\n"
                    "ldr d5, [a_ptr1], #0x8\n"
                    "ldr q16, [a_ptr5], #0x10\n"
                    "ldr d8, [a_ptr2], #0x8\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr d11, [a_ptr3], #0x8\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "ldr d14, [a_ptr4], #0x8\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "ldr d17, [a_ptr5], #0x8\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "ld1 {v2.s}[2], [%[a_ptr0]]\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "ld1 {v5.s}[2], [a_ptr1]\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "ld1 {v8.s}[2], [a_ptr2]\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "ld1 {v11.s}[2], [a_ptr3]\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "ld1 {v14.s}[2], [a_ptr4]\n"
                    "ld1 {v17.s}[2], [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 12:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q3, [a_ptr1], #0x10\n"
                    "ldr q6, [a_ptr2], #0x10\n"
                    "ldr q9, [a_ptr3], #0x10\n"
                    "ldr q12, [a_ptr4], #0x10\n"
                    "ldr q15, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q7, [a_ptr2], #0x10\n"
                    "ldr q10, [a_ptr3], #0x10\n"
                    "ldr q13, [a_ptr4], #0x10\n"
                    "ldr q16, [a_ptr5], #0x10\n"
                    "ldr q2, [%[a_ptr0]]\n"
                    "ldr q5, [a_ptr1]\n"
                    "ldr q8, [a_ptr2]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "ldr q11, [a_ptr3]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "ldr q14, [a_ptr4]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "ldr q17, [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "fmla v26.4s, v21.4s, v2.s[3]\n"
                    "fmla v27.4s, v21.4s, v5.s[3]\n"
                    "fmla v28.4s, v21.4s, v8.s[3]\n"
                    "fmla v29.4s, v21.4s, v11.s[3]\n"
                    "fmla v30.4s, v21.4s, v14.s[3]\n"
                    "fmla v31.4s, v21.4s, v17.s[3]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "fmla v26.4s, v21.4s, v2.s[3]\n"
                    "fmla v27.4s, v21.4s, v5.s[3]\n"
                    "fmla v28.4s, v21.4s, v8.s[3]\n"
                    "fmla v29.4s, v21.4s, v11.s[3]\n"
                    "fmla v30.4s, v21.4s, v14.s[3]\n"
                    "fmla v31.4s, v21.4s, v17.s[3]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "ldr q22, [%[b_ptr0], #0x40]\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "ldr q23, [%[b_ptr0], #0x50]\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0], #0x60]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "fmla v26.4s, v21.4s, v2.s[3]\n"
                    "fmla v27.4s, v21.4s, v5.s[3]\n"
                    "fmla v28.4s, v21.4s, v8.s[3]\n"
                    "fmla v29.4s, v21.4s, v11.s[3]\n"
                    "fmla v30.4s, v21.4s, v14.s[3]\n"
                    "fmla v31.4s, v21.4s, v17.s[3]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v18.4s, v0.s[0]\n"
                    "fmla v27.4s, v18.4s, v3.s[0]\n"
                    "fmla v28.4s, v18.4s, v6.s[0]\n"
                    "fmla v29.4s, v18.4s, v9.s[0]\n"
                    "fmla v30.4s, v18.4s, v12.s[0]\n"
                    "fmla v31.4s, v18.4s, v15.s[0]\n"
                    "ldr q18, [%[b_ptr0]]\n"
                    "fmla v26.4s, v19.4s, v0.s[1]\n"
                    "fmla v27.4s, v19.4s, v3.s[1]\n"
                    "fmla v28.4s, v19.4s, v6.s[1]\n"
                    "fmla v29.4s, v19.4s, v9.s[1]\n"
                    "fmla v30.4s, v19.4s, v12.s[1]\n"
                    "fmla v31.4s, v19.4s, v15.s[1]\n"
                    "ldr q19, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v20.4s, v0.s[2]\n"
                    "fmla v27.4s, v20.4s, v3.s[2]\n"
                    "fmla v28.4s, v20.4s, v6.s[2]\n"
                    "fmla v29.4s, v20.4s, v9.s[2]\n"
                    "fmla v30.4s, v20.4s, v12.s[2]\n"
                    "fmla v31.4s, v20.4s, v15.s[2]\n"
                    "ldr q20, [%[b_ptr0], #0x20]\n"
                    "fmla v26.4s, v21.4s, v0.s[3]\n"
                    "fmla v27.4s, v21.4s, v3.s[3]\n"
                    "fmla v28.4s, v21.4s, v6.s[3]\n"
                    "fmla v29.4s, v21.4s, v9.s[3]\n"
                    "fmla v30.4s, v21.4s, v12.s[3]\n"
                    "fmla v31.4s, v21.4s, v15.s[3]\n"
                    "ldr q21, [%[b_ptr0], #0x30]\n"
                    "fmla v26.4s, v22.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "fmla v27.4s, v22.4s, v4.s[0]\n"
                    "fmla v28.4s, v22.4s, v7.s[0]\n"
                    "fmla v29.4s, v22.4s, v10.s[0]\n"
                    "fmla v30.4s, v22.4s, v13.s[0]\n"
                    "fmla v31.4s, v22.4s, v16.s[0]\n"
                    "fmla v26.4s, v23.4s, v1.s[1]\n"
                    "fmla v27.4s, v23.4s, v4.s[1]\n"
                    "fmla v28.4s, v23.4s, v7.s[1]\n"
                    "fmla v29.4s, v23.4s, v10.s[1]\n"
                    "fmla v30.4s, v23.4s, v13.s[1]\n"
                    "fmla v31.4s, v23.4s, v16.s[1]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v7.s[2]\n"
                    "fmla v29.4s, v24.4s, v10.s[2]\n"
                    "fmla v30.4s, v24.4s, v13.s[2]\n"
                    "fmla v31.4s, v24.4s, v16.s[2]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v7.s[3]\n"
                    "fmla v29.4s, v25.4s, v10.s[3]\n"
                    "fmla v30.4s, v25.4s, v13.s[3]\n"
                    "fmla v31.4s, v25.4s, v16.s[3]\n"
                    "fmla v26.4s, v18.4s, v2.s[0]\n"
                    "fmla v27.4s, v18.4s, v5.s[0]\n"
                    "fmla v28.4s, v18.4s, v8.s[0]\n"
                    "fmla v29.4s, v18.4s, v11.s[0]\n"
                    "fmla v30.4s, v18.4s, v14.s[0]\n"
                    "fmla v31.4s, v18.4s, v17.s[0]\n"
                    "fmla v26.4s, v19.4s, v2.s[1]\n"
                    "fmla v27.4s, v19.4s, v5.s[1]\n"
                    "fmla v28.4s, v19.4s, v8.s[1]\n"
                    "fmla v29.4s, v19.4s, v11.s[1]\n"
                    "fmla v30.4s, v19.4s, v14.s[1]\n"
                    "fmla v31.4s, v19.4s, v17.s[1]\n"
                    "fmla v26.4s, v20.4s, v2.s[2]\n"
                    "fmla v27.4s, v20.4s, v5.s[2]\n"
                    "fmla v28.4s, v20.4s, v8.s[2]\n"
                    "fmla v29.4s, v20.4s, v11.s[2]\n"
                    "fmla v30.4s, v20.4s, v14.s[2]\n"
                    "fmla v31.4s, v20.4s, v17.s[2]\n"
                    "fmla v26.4s, v21.4s, v2.s[3]\n"
                    "fmla v27.4s, v21.4s, v5.s[3]\n"
                    "fmla v28.4s, v21.4s, v8.s[3]\n"
                    "fmla v29.4s, v21.4s, v11.s[3]\n"
                    "fmla v30.4s, v21.4s, v14.s[3]\n"
                    "fmla v31.4s, v21.4s, v17.s[3]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 13:
                // NOTE(review): presumably the K == 13 specialisation (the switch expression is not visible
                // here); the code below reads 13 floats (3 x q + 1 x s) from each of six A rows and performs
                // 13 multiply-accumulate steps per block of four output columns.
                __asm __volatile (
                    // Register aliases: x0-x4 hold the A pointers for rows 1-5 and x5-x9 the C pointers for
                    // rows 1-5. Row 0 uses the %[a_ptr0] / %[c_ptr0] operands directly.
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    // Derive each row pointer from the previous one by adding the %[lda] / %[ldc] operand
                    // (presumably byte strides, as bound in the preceding case's operand list - confirm
                    // against this statement's own operand list).
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    // Out-of-bounds rows: for a non-zero %[oob_rows], redirect that many trailing rows
                    // (row 5 first, then 4, 3, 2, 1) to row 0's A and C pointers. A redirected row therefore
                    // reads row 0's A data and writes to row 0's C location.
                    // Each `subs` sets the Z flag when the count reaches zero; the two `add`s that follow do
                    // not modify flags, so `b.eq` tests the `subs` result.
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    // Load the A operands once; they stay resident for every column block.
                    // Register map (three full vectors + one scalar per row):
                    //   row 0: v0-v3    row 1: v4-v7    row 2: v8-v11
                    //   row 3: v12-v15  row 4: v16-v19  row 5: v20-v23
                    // The q loads post-increment the row pointer by 16 bytes; the final `ldr s` of each row
                    // has no writeback, and only lane 0 of v3/v7/v11/v15/v19/v23 is used by the fmla
                    // sequences visible in this statement.
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q8, [a_ptr2], #0x10\n"
                    "ldr q12, [a_ptr3], #0x10\n"
                    "ldr q16, [a_ptr4], #0x10\n"
                    "ldr q20, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q5, [a_ptr1], #0x10\n"
                    "ldr q9, [a_ptr2], #0x10\n"
                    "ldr q13, [a_ptr3], #0x10\n"
                    "ldr q17, [a_ptr4], #0x10\n"
                    "ldr q21, [a_ptr5], #0x10\n"
                    "ldr q2, [%[a_ptr0]], #0x10\n"
                    "ldr q6, [a_ptr1], #0x10\n"
                    "ldr q10, [a_ptr2], #0x10\n"
                    "ldr q14, [a_ptr3], #0x10\n"
                    "ldr s3, [%[a_ptr0]]\n"
                    "ldr q18, [a_ptr4], #0x10\n"
                    "ldr s7, [a_ptr1]\n"
                    "ldr q22, [a_ptr5], #0x10\n"
                    "ldr s11, [a_ptr2]\n"
                    // v24 / v25 are the two B buffers; preload the first two 4-float B vectors.
                    "ldr q24, [%[b_ptr0]]\n"
                    "ldr s15, [a_ptr3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "ldr s19, [a_ptr4]\n"
                    "ldr s23, [a_ptr5]\n"
                    // Load-prefetch hints at a_ptr5 + 0x40 ... + 0x180.
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    // %[loops] == 0: skip the pipelined path and go to label 2.
                    "cbz %[loops], 2f\n"
                    // ---- First column block ----
                    // Accumulators v26-v31 (one per row) all start from the same 4-float bias vector;
                    // %[biasptr] then advances by %[biasinc].
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    // Flags from this `subs` are consumed by `b.eq 3f` at the end of the block; the
                    // mov/fmla/ldr/add instructions in between do not modify flags.
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    // Each group of six fmla performs acc[row] += B_vector * A[row][k] for one k.
                    // v24 and v25 alternate as the B vector and are refilled from %[b_ptr0] right after
                    // their six uses.
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    // 13th (odd) B vector: only v24 is refilled and %[b_ptr0] advances by a single vector.
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    // Counter reached zero at the `subs` above: skip the loop and go to label 3.
                    "b.eq 3f\n"
                    // ---- Main loop ----
                    // Clamp and store the block just computed, interleaved with the start of the next block.
                    // v24 / v25 briefly hold the broadcast values read through %[minptr] / %[maxptr] and are
                    // reloaded with B data only after the last fmax / fmin that uses them. Each accumulator
                    // is stored before being re-initialised from the new bias vector in v26, and v26 itself
                    // is accumulated into only after it has been copied to v27-v31.
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    // Loop counter; flags are consumed by `b.ne 4b` at the bottom of the loop.
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    // Store-prefetch hints 0x40 bytes past each (already advanced) C row pointer.
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "b.ne 4b\n"
                    // ---- After the last counted iteration ----
                    // Reached from `b.eq 3f` or by falling out of the loop. Same clamp/store/compute
                    // sequence as the loop body, but without the counter decrement and the store-prefetch
                    // hints; it ends by branching forward to label 5.
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "b 5f\n"
                    // ---- %[loops] was zero on entry (target of the `cbz` above) ----
                    // Initialise the accumulators from the bias vector and run the same multiply-accumulate
                    // sequence; no previously computed block needs storing on this path.
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 14:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q8, [a_ptr2], #0x10\n"
                    "ldr q12, [a_ptr3], #0x10\n"
                    "ldr q16, [a_ptr4], #0x10\n"
                    "ldr q20, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q5, [a_ptr1], #0x10\n"
                    "ldr q9, [a_ptr2], #0x10\n"
                    "ldr q13, [a_ptr3], #0x10\n"
                    "ldr q17, [a_ptr4], #0x10\n"
                    "ldr q21, [a_ptr5], #0x10\n"
                    "ldr q2, [%[a_ptr0]], #0x10\n"
                    "ldr q6, [a_ptr1], #0x10\n"
                    "ldr q10, [a_ptr2], #0x10\n"
                    "ldr q14, [a_ptr3], #0x10\n"
                    "ldr d3, [%[a_ptr0]]\n"
                    "ldr q18, [a_ptr4], #0x10\n"
                    "ldr d7, [a_ptr1]\n"
                    "ldr q22, [a_ptr5], #0x10\n"
                    "ldr d11, [a_ptr2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "ldr d15, [a_ptr3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "ldr d19, [a_ptr4]\n"
                    "ldr d23, [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 15:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q8, [a_ptr2], #0x10\n"
                    "ldr q12, [a_ptr3], #0x10\n"
                    "ldr q16, [a_ptr4], #0x10\n"
                    "ldr q20, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q5, [a_ptr1], #0x10\n"
                    "ldr q9, [a_ptr2], #0x10\n"
                    "ldr q13, [a_ptr3], #0x10\n"
                    "ldr q17, [a_ptr4], #0x10\n"
                    "ldr q21, [a_ptr5], #0x10\n"
                    "ldr q2, [%[a_ptr0]], #0x10\n"
                    "ldr q6, [a_ptr1], #0x10\n"
                    "ldr q10, [a_ptr2], #0x10\n"
                    "ldr q14, [a_ptr3], #0x10\n"
                    "ldr d3, [%[a_ptr0]], #0x8\n"
                    "ldr q18, [a_ptr4], #0x10\n"
                    "ldr d7, [a_ptr1], #0x8\n"
                    "ldr q22, [a_ptr5], #0x10\n"
                    "ldr d11, [a_ptr2], #0x8\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "ldr d15, [a_ptr3], #0x8\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "ldr d19, [a_ptr4], #0x8\n"
                    "ldr d23, [a_ptr5], #0x8\n"
                    "ld1 {v3.s}[2], [%[a_ptr0]]\n"
                    "ld1 {v7.s}[2], [a_ptr1]\n"
                    "ld1 {v11.s}[2], [a_ptr2]\n"
                    "ld1 {v15.s}[2], [a_ptr3]\n"
                    "ld1 {v19.s}[2], [a_ptr4]\n"
                    "ld1 {v23.s}[2], [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            default:
            case 16:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "c_ptr1 .req X5\n"
                    "c_ptr2 .req X6\n"
                    "c_ptr3 .req X7\n"
                    "c_ptr4 .req X8\n"
                    "c_ptr5 .req X9\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q4, [a_ptr1], #0x10\n"
                    "ldr q8, [a_ptr2], #0x10\n"
                    "ldr q12, [a_ptr3], #0x10\n"
                    "ldr q16, [a_ptr4], #0x10\n"
                    "ldr q20, [a_ptr5], #0x10\n"
                    "ldr q1, [%[a_ptr0]], #0x10\n"
                    "ldr q5, [a_ptr1], #0x10\n"
                    "ldr q9, [a_ptr2], #0x10\n"
                    "ldr q13, [a_ptr3], #0x10\n"
                    "ldr q17, [a_ptr4], #0x10\n"
                    "ldr q21, [a_ptr5], #0x10\n"
                    "ldr q2, [%[a_ptr0]], #0x10\n"
                    "ldr q6, [a_ptr1], #0x10\n"
                    "ldr q10, [a_ptr2], #0x10\n"
                    "ldr q14, [a_ptr3], #0x10\n"
                    "ldr q18, [a_ptr4], #0x10\n"
                    "ldr q22, [a_ptr5], #0x10\n"
                    "ldr q3, [%[a_ptr0]]\n"
                    "ldr q7, [a_ptr1]\n"
                    "ldr q11, [a_ptr2]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
                    "ldr q15, [a_ptr3]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
                    "ldr q19, [a_ptr4]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
                    "ldr q23, [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "cbz %[loops], 2f\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "fmla v26.4s, v25.4s, v3.s[3]\n"
                    "fmla v27.4s, v25.4s, v7.s[3]\n"
                    "fmla v28.4s, v25.4s, v11.s[3]\n"
                    "fmla v29.4s, v25.4s, v15.s[3]\n"
                    "fmla v30.4s, v25.4s, v19.s[3]\n"
                    "fmla v31.4s, v25.4s, v23.s[3]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "fmla v26.4s, v25.4s, v3.s[3]\n"
                    "fmla v27.4s, v25.4s, v7.s[3]\n"
                    "fmla v28.4s, v25.4s, v11.s[3]\n"
                    "fmla v29.4s, v25.4s, v15.s[3]\n"
                    "fmla v30.4s, v25.4s, v19.s[3]\n"
                    "fmla v31.4s, v25.4s, v23.s[3]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "ldr q26, [%[biasptr]]\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q27, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "mov v27.16b, v26.16b\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "mov v28.16b, v26.16b\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "str q29, [c_ptr3]\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v29.16b, v26.16b\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "str q30, [c_ptr4]\n"
                    "mov v30.16b, v26.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "str q31, [c_ptr5]\n"
                    "mov v31.16b, v26.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "fmla v26.4s, v25.4s, v3.s[3]\n"
                    "fmla v27.4s, v25.4s, v7.s[3]\n"
                    "fmla v28.4s, v25.4s, v11.s[3]\n"
                    "fmla v29.4s, v25.4s, v15.s[3]\n"
                    "fmla v30.4s, v25.4s, v19.s[3]\n"
                    "fmla v31.4s, v25.4s, v23.s[3]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q26, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v27.16b, v26.16b\n"
                    "mov v28.16b, v26.16b\n"
                    "mov v29.16b, v26.16b\n"
                    "mov v30.16b, v26.16b\n"
                    "mov v31.16b, v26.16b\n"
                    "fmla v26.4s, v24.4s, v0.s[0]\n"
                    "fmla v27.4s, v24.4s, v4.s[0]\n"
                    "fmla v28.4s, v24.4s, v8.s[0]\n"
                    "fmla v29.4s, v24.4s, v12.s[0]\n"
                    "fmla v30.4s, v24.4s, v16.s[0]\n"
                    "fmla v31.4s, v24.4s, v20.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[1]\n"
                    "fmla v27.4s, v25.4s, v4.s[1]\n"
                    "fmla v28.4s, v25.4s, v8.s[1]\n"
                    "fmla v29.4s, v25.4s, v12.s[1]\n"
                    "fmla v30.4s, v25.4s, v16.s[1]\n"
                    "fmla v31.4s, v25.4s, v20.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v0.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v4.s[2]\n"
                    "fmla v28.4s, v24.4s, v8.s[2]\n"
                    "fmla v29.4s, v24.4s, v12.s[2]\n"
                    "fmla v30.4s, v24.4s, v16.s[2]\n"
                    "fmla v31.4s, v24.4s, v20.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v0.s[3]\n"
                    "fmla v27.4s, v25.4s, v4.s[3]\n"
                    "fmla v28.4s, v25.4s, v8.s[3]\n"
                    "fmla v29.4s, v25.4s, v12.s[3]\n"
                    "fmla v30.4s, v25.4s, v16.s[3]\n"
                    "fmla v31.4s, v25.4s, v20.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[0]\n"
                    "fmla v28.4s, v24.4s, v9.s[0]\n"
                    "fmla v29.4s, v24.4s, v13.s[0]\n"
                    "fmla v30.4s, v24.4s, v17.s[0]\n"
                    "fmla v31.4s, v24.4s, v21.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[1]\n"
                    "fmla v27.4s, v25.4s, v5.s[1]\n"
                    "fmla v28.4s, v25.4s, v9.s[1]\n"
                    "fmla v29.4s, v25.4s, v13.s[1]\n"
                    "fmla v30.4s, v25.4s, v17.s[1]\n"
                    "fmla v31.4s, v25.4s, v21.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v1.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v5.s[2]\n"
                    "fmla v28.4s, v24.4s, v9.s[2]\n"
                    "fmla v29.4s, v24.4s, v13.s[2]\n"
                    "fmla v30.4s, v24.4s, v17.s[2]\n"
                    "fmla v31.4s, v24.4s, v21.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v1.s[3]\n"
                    "fmla v27.4s, v25.4s, v5.s[3]\n"
                    "fmla v28.4s, v25.4s, v9.s[3]\n"
                    "fmla v29.4s, v25.4s, v13.s[3]\n"
                    "fmla v30.4s, v25.4s, v17.s[3]\n"
                    "fmla v31.4s, v25.4s, v21.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[0]\n"
                    "fmla v28.4s, v24.4s, v10.s[0]\n"
                    "fmla v29.4s, v24.4s, v14.s[0]\n"
                    "fmla v30.4s, v24.4s, v18.s[0]\n"
                    "fmla v31.4s, v24.4s, v22.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[1]\n"
                    "fmla v27.4s, v25.4s, v6.s[1]\n"
                    "fmla v28.4s, v25.4s, v10.s[1]\n"
                    "fmla v29.4s, v25.4s, v14.s[1]\n"
                    "fmla v30.4s, v25.4s, v18.s[1]\n"
                    "fmla v31.4s, v25.4s, v22.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v2.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v6.s[2]\n"
                    "fmla v28.4s, v24.4s, v10.s[2]\n"
                    "fmla v29.4s, v24.4s, v14.s[2]\n"
                    "fmla v30.4s, v24.4s, v18.s[2]\n"
                    "fmla v31.4s, v24.4s, v22.s[2]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v2.s[3]\n"
                    "fmla v27.4s, v25.4s, v6.s[3]\n"
                    "fmla v28.4s, v25.4s, v10.s[3]\n"
                    "fmla v29.4s, v25.4s, v14.s[3]\n"
                    "fmla v30.4s, v25.4s, v18.s[3]\n"
                    "fmla v31.4s, v25.4s, v22.s[3]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[0]\n"
                    "fmla v28.4s, v24.4s, v11.s[0]\n"
                    "fmla v29.4s, v24.4s, v15.s[0]\n"
                    "fmla v30.4s, v24.4s, v19.s[0]\n"
                    "fmla v31.4s, v24.4s, v23.s[0]\n"
                    "ldr q24, [%[b_ptr0]]\n"
                    "fmla v26.4s, v25.4s, v3.s[1]\n"
                    "fmla v27.4s, v25.4s, v7.s[1]\n"
                    "fmla v28.4s, v25.4s, v11.s[1]\n"
                    "fmla v29.4s, v25.4s, v15.s[1]\n"
                    "fmla v30.4s, v25.4s, v19.s[1]\n"
                    "fmla v31.4s, v25.4s, v23.s[1]\n"
                    "ldr q25, [%[b_ptr0], #0x10]\n"
                    "fmla v26.4s, v24.4s, v3.s[2]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmla v27.4s, v24.4s, v7.s[2]\n"
                    "fmla v28.4s, v24.4s, v11.s[2]\n"
                    "fmla v29.4s, v24.4s, v15.s[2]\n"
                    "fmla v30.4s, v24.4s, v19.s[2]\n"
                    "fmla v31.4s, v24.4s, v23.s[2]\n"
                    "fmla v26.4s, v25.4s, v3.s[3]\n"
                    "fmla v27.4s, v25.4s, v7.s[3]\n"
                    "fmla v28.4s, v25.4s, v11.s[3]\n"
                    "fmla v29.4s, v25.4s, v15.s[3]\n"
                    "fmla v30.4s, v25.4s, v19.s[3]\n"
                    "fmla v31.4s, v25.4s, v23.s[3]\n"
                    "5:\n"
                    "ld1r {v24.4s}, [%[minptr]]\n"
                    "ld1r {v25.4s}, [%[maxptr]]\n"
                    "fmax v26.4s, v26.4s, v24.4s\n"
                    "fmax v27.4s, v27.4s, v24.4s\n"
                    "fmax v28.4s, v28.4s, v24.4s\n"
                    "fmax v29.4s, v29.4s, v24.4s\n"
                    "fmin v26.4s, v26.4s, v25.4s\n"
                    "fmin v27.4s, v27.4s, v25.4s\n"
                    "fmin v28.4s, v28.4s, v25.4s\n"
                    "fmin v29.4s, v29.4s, v25.4s\n"
                    "str q26, [%[c_ptr0]]\n"
                    "fmax v30.4s, v30.4s, v24.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v31.4s, v31.4s, v24.4s\n"
                    "str q27, [c_ptr1]\n"
                    "fmin v30.4s, v30.4s, v25.4s\n"
                    "fmin v31.4s, v31.4s, v25.4s\n"
                    "str q28, [c_ptr2]\n"
                    "str q29, [c_ptr3]\n"
                    "str q30, [c_ptr4]\n"
                    "str q31, [c_ptr5]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__
